Lab 03 - Regular Expressions

Courtney Stowers

Library

# Load library
library(tidyverse)
library(stringi)

Data

# Source data: IMDB Top 250 lists plus roughly 5000 additional IMDB records
# https://data.world/studentoflife/imdb-top-250-lists-and-5000-or-so-data-records
df <- read.csv(
  file = "https://query.data.world/s/rr46ndg7fyne54q7oonmvzxbaxg3zn",
  header = TRUE,
  stringsAsFactors = FALSE
)

# List the names of the columns that were read in
names(df)
##  [1] "Title"          "Year"           "Rated"          "Released"      
##  [5] "Runtime"        "Genre"          "Director"       "Writer"        
##  [9] "Actors"         "Plot"           "Language"       "Country"       
## [13] "Awards"         "Poster"         "Ratings.Source" "Ratings.Value" 
## [17] "Metascore"      "imdbRating"     "imdbVotes"      "imdbID"        
## [21] "Type"           "DVD"            "BoxOffice"      "Production"    
## [25] "Website"        "Response"       "tomatoURL"

GREP & GREPL

# Directors whose name contains "Courtney" (case-insensitive)
df$Director[grepl("Courtney", df$Director, ignore.case = TRUE)]
## [1] "Courtney Solomon" "Courtney Solomon" "Courtney Hunt"
# Writer credits that contain "Courtney" (case-insensitive)
df$Writer %>%
  grep("Courtney", ., value = TRUE, ignore.case = TRUE)
## [1] "Brent Monahan (novel), Courtney Solomon"                                                                                     
## [2] "Irwin Yablans (story), C. Courtney Joyner (screenplay)"                                                                      
## [3] "C. Courtney Joyner, Mike Malone (additional story material), Darin Scott, Jeff Burr, Mike Malone (additional story material)"
## [4] "Courtney Hunt"
# Cast lists in which at least one actor has "Courtney" in their name
df$Actors[grepl("Courtney", df$Actors, ignore.case = TRUE)]
##  [1] "Scott 'Carrot Top' Thompson, Courtney Thorne-Smith, Larry Miller, Raquel Welch"
##  [2] "Arnold Schwarzenegger, Jason Clarke, Emilia Clarke, Jai Courtney"              
##  [3] "Brenton Thwaites, John Samaha, Courtney Eaton, Nikolaj Coster-Waldau"          
##  [4] "Kate Winslet, Jai Courtney, Mekhi Phifer, Shailene Woodley"                    
##  [5] "Bruce Willis, Jai Courtney, Sebastian Koch, Mary Elizabeth Winstead"           
##  [6] "Shailene Woodley, Theo James, Ashley Judd, Jai Courtney"                       
##  [7] "Joel Courtney, Jessica Tuck, Joel McKinnon Miller, Ryan Lee"                   
##  [8] "Sandra Bullock, Julian McMahon, Shyann McClure, Courtney Taylor Burness"       
##  [9] "Milla Jovovich, Brian Krause, Lisa Pelikan, Courtney Barilla"                  
## [10] "Elijah Wood, Courtney B. Vance, Robbie Coltrane, Jason Robards"                
## [11] "Lauren German, Michael Biehn, Milo Ventimiglia, Courtney B. Vance"             
## [12] "Heather Sossaman, Matthew Bohrer, Courtney Halverson, Shelley Hennig"          
## [13] "Catherine Parker, Courtney Bell, Dave Levine, Justin Gordon"
# Create cast list: take the cast strings that mention "Courtney", glue them
# into one comma-separated string, then split that back into individual names
courtney_cast <- df$Actors %>%
  grep("Courtney", ., value = TRUE, ignore.case = TRUE) %>%
  paste(collapse = ", ") %>%
  str_split(", ")

courtney_cast
## [[1]]
##  [1] "Scott 'Carrot Top' Thompson" "Courtney Thorne-Smith"      
##  [3] "Larry Miller"                "Raquel Welch"               
##  [5] "Arnold Schwarzenegger"       "Jason Clarke"               
##  [7] "Emilia Clarke"               "Jai Courtney"               
##  [9] "Brenton Thwaites"            "John Samaha"                
## [11] "Courtney Eaton"              "Nikolaj Coster-Waldau"      
## [13] "Kate Winslet"                "Jai Courtney"               
## [15] "Mekhi Phifer"                "Shailene Woodley"           
## [17] "Bruce Willis"                "Jai Courtney"               
## [19] "Sebastian Koch"              "Mary Elizabeth Winstead"    
## [21] "Shailene Woodley"            "Theo James"                 
## [23] "Ashley Judd"                 "Jai Courtney"               
## [25] "Joel Courtney"               "Jessica Tuck"               
## [27] "Joel McKinnon Miller"        "Ryan Lee"                   
## [29] "Sandra Bullock"              "Julian McMahon"             
## [31] "Shyann McClure"              "Courtney Taylor Burness"    
## [33] "Milla Jovovich"              "Brian Krause"               
## [35] "Lisa Pelikan"                "Courtney Barilla"           
## [37] "Elijah Wood"                 "Courtney B. Vance"          
## [39] "Robbie Coltrane"             "Jason Robards"              
## [41] "Lauren German"               "Michael Biehn"              
## [43] "Milo Ventimiglia"            "Courtney B. Vance"          
## [45] "Heather Sossaman"            "Matthew Bohrer"             
## [47] "Courtney Halverson"          "Shelley Hennig"             
## [49] "Catherine Parker"            "Courtney Bell"              
## [51] "Dave Levine"                 "Justin Gordon"
# Flatten the one-element list into a plain character vector
courtney_cast <- unlist(courtney_cast)
courtney_cast
##  [1] "Scott 'Carrot Top' Thompson" "Courtney Thorne-Smith"      
##  [3] "Larry Miller"                "Raquel Welch"               
##  [5] "Arnold Schwarzenegger"       "Jason Clarke"               
##  [7] "Emilia Clarke"               "Jai Courtney"               
##  [9] "Brenton Thwaites"            "John Samaha"                
## [11] "Courtney Eaton"              "Nikolaj Coster-Waldau"      
## [13] "Kate Winslet"                "Jai Courtney"               
## [15] "Mekhi Phifer"                "Shailene Woodley"           
## [17] "Bruce Willis"                "Jai Courtney"               
## [19] "Sebastian Koch"              "Mary Elizabeth Winstead"    
## [21] "Shailene Woodley"            "Theo James"                 
## [23] "Ashley Judd"                 "Jai Courtney"               
## [25] "Joel Courtney"               "Jessica Tuck"               
## [27] "Joel McKinnon Miller"        "Ryan Lee"                   
## [29] "Sandra Bullock"              "Julian McMahon"             
## [31] "Shyann McClure"              "Courtney Taylor Burness"    
## [33] "Milla Jovovich"              "Brian Krause"               
## [35] "Lisa Pelikan"                "Courtney Barilla"           
## [37] "Elijah Wood"                 "Courtney B. Vance"          
## [39] "Robbie Coltrane"             "Jason Robards"              
## [41] "Lauren German"               "Michael Biehn"              
## [43] "Milo Ventimiglia"            "Courtney B. Vance"          
## [45] "Heather Sossaman"            "Matthew Bohrer"             
## [47] "Courtney Halverson"          "Shelley Hennig"             
## [49] "Catherine Parker"            "Courtney Bell"              
## [51] "Dave Levine"                 "Justin Gordon"
# Keep only the cast members whose own name contains "Courtney", de-duplicated
courtney_actors <- unique(
  grep("Courtney", courtney_cast, value = TRUE, ignore.case = TRUE)
)

courtney_actors
## [1] "Courtney Thorne-Smith"   "Jai Courtney"           
## [3] "Courtney Eaton"          "Joel Courtney"          
## [5] "Courtney Taylor Burness" "Courtney Barilla"       
## [7] "Courtney B. Vance"       "Courtney Halverson"     
## [9] "Courtney Bell"
# Actors whose name begins with "Courtney", i.e. Courtney is the first name
courtney_actors[grepl("^Courtney", courtney_actors, ignore.case = TRUE)]
## [1] "Courtney Thorne-Smith"   "Courtney Eaton"         
## [3] "Courtney Taylor Burness" "Courtney Barilla"       
## [5] "Courtney B. Vance"       "Courtney Halverson"     
## [7] "Courtney Bell"
# Actors whose name ends with "Courtney", i.e. Courtney is the last name
courtney_actors[grepl("Courtney$", courtney_actors, ignore.case = TRUE)]
## [1] "Jai Courtney"  "Joel Courtney"
# Pull all actors: split each comma-separated cast string, flatten the result,
# strip surrounding whitespace, and drop duplicates
actors <- unique(str_trim(unlist(str_split(df$Actors, ", "))))

head(actors, 20)
##  [1] "Cem Kurtoglu"         "Hakan Ural"           "Hazim Körmükçü"      
##  [4] "Tolga Karel"          "Kirk Cameron"         "Darren Doane"        
##  [7] "Bridgette Cameron"    "Ben Kientz"           "Jon Voight"          
## [10] "Scott Baio"           "Vanessa Angel"        "Skyler Shaye"        
## [13] "Daniel Küblböck"      "Ulli Lommel"          "Rudolf Waldemar Brem"
## [16] "Katja Rupé"           "Tom Neyman"           "John Reynolds"       
## [19] "Diane Adelson"        "Harold P. Warren"
# First attempt at actors whose first name starts with C and ends with Y: keep
# names that start with C, contain a "y" followed by whitespace, and do not
# contain a "C" followed by a period. This over-matches, because the "y" may
# end a middle name instead (e.g. Caleb Landry Jones, Charles Henry Wyson).
actors[
  grepl("^C", actors, ignore.case = TRUE) &
    grepl("y\\s", actors, ignore.case = TRUE) &
    !grepl("C\\.", actors, ignore.case = TRUE)
] %>%
  sort()
##  [1] "Caity Lotz"              "Caleb Landry Jones"     
##  [3] "Caley Hayes"             "Candy Ford"             
##  [5] "Carey Crim"              "Carey Lowell"           
##  [7] "Carey Means"             "Carey Mulligan"         
##  [9] "Carly Nahon"             "Carly Schroeder"        
## [11] "Cary Elwes"              "Cary Grant"             
## [13] "Casey Affleck"           "Casey Dubois"           
## [15] "Casey Fallo"             "Casey Gooden"           
## [17] "Casey Groves"            "Casey Hooper"           
## [19] "Casey Twenter"           "Cassidy Gifford"        
## [21] "Cathy Meils"             "Cathy Moriarty"         
## [23] "Chaney Kley"             "Charles Henry Wyson"    
## [25] "Charley Grapewin"        "Chauncey Leopardi"      
## [27] "Chelsey Reist"           "Cherry Jones"           
## [29] "Chevy Chase"             "Christy Chung"          
## [31] "Christy Lighthouse"      "Cicely Tyson"           
## [33] "Cindy Butler"            "Cindy Karr"             
## [35] "Cindy Manion"            "Clancy Brown"           
## [37] "Clemency Burton-Hill"    "Cody Horn"              
## [39] "Cody Howard"             "Cody Linley"            
## [41] "Cody McMains"            "Colby French"           
## [43] "Corey Burton"            "Corey Feldman"          
## [45] "Corey Haim"              "Corey Hawkins"          
## [47] "Corey Johnson"           "Corey Moosa"            
## [49] "Corey Sevier"            "Corey Stoll"            
## [51] "Cory Fernandez"          "Cory Hardrict"          
## [53] "Cory Hodges"             "Cory Monteith"          
## [55] "Courteney Cox"           "Courtney B. Vance"      
## [57] "Courtney Barilla"        "Courtney Bell"          
## [59] "Courtney Eaton"          "Courtney Halverson"     
## [61] "Courtney Taylor Burness" "Courtney Thorne-Smith"
# First attempt at actors whose last name starts with C and ends with Y: keep
# names that end in "y" and contain whitespace followed by C. This also
# over-matches, e.g. Larry the Cable Guy and John C. McGinley, whose last names
# do not start with C.
actors[
  grepl("y$", actors, ignore.case = TRUE) &
    grepl("\\sC", actors, ignore.case = TRUE)
]
##  [1] "Jonathan Cherry"        "Mariah Carey"           "Bill Cosby"            
##  [4] "Tom Courtenay"          "Kevin Casey"            "John C. McGinley"      
##  [7] "Wendell Corey"          "Jennifer Connelly"      "Jim Carrey"            
## [10] "Sean Connery"           "Christopher Carley"     "Walter Connolly"       
## [13] "Joyce Carey"            "Mithun Chakraborty"     "Alok Chakravarty"      
## [16] "Raj Singh Chaudhary"    "Tota Roy Chowdhury"     "Sabyasachi Chakraborty"
## [19] "Aashish Chaudhary"      "Mahima Chaudhry"        "Larry the Cable Guy"   
## [22] "Billy Connolly"         "Sharlto Copley"         "George Clooney"        
## [25] "Raffey Cassidy"         "John C. Reilly"         "Jai Courtney"          
## [28] "Jim Conroy"             "Art Carney"             "Henry Czerny"          
## [31] "Tim Conway"             "Joel Courtney"          "Tim Curry"             
## [34] "Chandler Canterbury"    "Jonathan Chan-Pensley"  "Andrew Dice Clay"      
## [37] "Rob Corddry"            "Reeve Carney"           "Emma Cleasby"          
## [40] "Kristen Connolly"       "Chris Coy"              "Cab Calloway"          
## [43] "Darryl Cooksey"         "John Candy"             "Annie Corley"          
## [46] "Frances Conroy"         "Dana Carvey"            "Katie Cassidy"         
## [49] "Jarlath Conroy"         "Anna Chlumsky"          "Elaine Cassidy"        
## [52] "Ellie Chidzey"          "Anne Consigny"          "Jack Conley"           
## [55] "Julia Chantrey"         "Denise Crosby"          "Don 'D.C.' Curry"      
## [58] "Julius Carry"           "Babou Ceesay"           "Mary Crosby"           
## [61] "Matt Czuchry"           "Jeff Conaway"           "Chloe Csengery"        
## [64] "Reg E. Cathey"          "Sarita Choudhury"       "David Connolly"        
## [67] "Noam Chomsky"           "Christopher Curry"      "Jake Cherry"           
## [70] "Robert Clohessy"        "Joanna Cassidy"         "Gary Conway"           
## [73] "Katie Cleary"
# Actors whose first OR last name starts with C and ends with Y, combining the
# two whole-string tests above; the same over-matching issues still exist
actors[
  (
    grepl("^C", actors, ignore.case = TRUE) &
      grepl("y\\s", actors, ignore.case = TRUE) &
      !grepl("C\\.", actors, ignore.case = TRUE)
  ) |
    (
      grepl("\\sC", actors, ignore.case = TRUE) &
        grepl("y$", actors, ignore.case = TRUE)
    )
] %>%
  sort()
##   [1] "Aashish Chaudhary"       "Alok Chakravarty"       
##   [3] "Andrew Dice Clay"        "Anna Chlumsky"          
##   [5] "Anne Consigny"           "Annie Corley"           
##   [7] "Art Carney"              "Babou Ceesay"           
##   [9] "Bill Cosby"              "Billy Connolly"         
##  [11] "Cab Calloway"            "Caity Lotz"             
##  [13] "Caleb Landry Jones"      "Caley Hayes"            
##  [15] "Candy Ford"              "Carey Crim"             
##  [17] "Carey Lowell"            "Carey Means"            
##  [19] "Carey Mulligan"          "Carly Nahon"            
##  [21] "Carly Schroeder"         "Cary Elwes"             
##  [23] "Cary Grant"              "Casey Affleck"          
##  [25] "Casey Dubois"            "Casey Fallo"            
##  [27] "Casey Gooden"            "Casey Groves"           
##  [29] "Casey Hooper"            "Casey Twenter"          
##  [31] "Cassidy Gifford"         "Cathy Meils"            
##  [33] "Cathy Moriarty"          "Chandler Canterbury"    
##  [35] "Chaney Kley"             "Charles Henry Wyson"    
##  [37] "Charley Grapewin"        "Chauncey Leopardi"      
##  [39] "Chelsey Reist"           "Cherry Jones"           
##  [41] "Chevy Chase"             "Chloe Csengery"         
##  [43] "Chris Coy"               "Christopher Carley"     
##  [45] "Christopher Curry"       "Christy Chung"          
##  [47] "Christy Lighthouse"      "Cicely Tyson"           
##  [49] "Cindy Butler"            "Cindy Karr"             
##  [51] "Cindy Manion"            "Clancy Brown"           
##  [53] "Clemency Burton-Hill"    "Cody Horn"              
##  [55] "Cody Howard"             "Cody Linley"            
##  [57] "Cody McMains"            "Colby French"           
##  [59] "Corey Burton"            "Corey Feldman"          
##  [61] "Corey Haim"              "Corey Hawkins"          
##  [63] "Corey Johnson"           "Corey Moosa"            
##  [65] "Corey Sevier"            "Corey Stoll"            
##  [67] "Cory Fernandez"          "Cory Hardrict"          
##  [69] "Cory Hodges"             "Cory Monteith"          
##  [71] "Courteney Cox"           "Courtney B. Vance"      
##  [73] "Courtney Barilla"        "Courtney Bell"          
##  [75] "Courtney Eaton"          "Courtney Halverson"     
##  [77] "Courtney Taylor Burness" "Courtney Thorne-Smith"  
##  [79] "Dana Carvey"             "Darryl Cooksey"         
##  [81] "David Connolly"          "Denise Crosby"          
##  [83] "Don 'D.C.' Curry"        "Elaine Cassidy"         
##  [85] "Ellie Chidzey"           "Emma Cleasby"           
##  [87] "Frances Conroy"          "Gary Conway"            
##  [89] "George Clooney"          "Henry Czerny"           
##  [91] "Jack Conley"             "Jai Courtney"           
##  [93] "Jake Cherry"             "Jarlath Conroy"         
##  [95] "Jeff Conaway"            "Jennifer Connelly"      
##  [97] "Jim Carrey"              "Jim Conroy"             
##  [99] "Joanna Cassidy"          "Joel Courtney"          
## [101] "John C. McGinley"        "John C. Reilly"         
## [103] "John Candy"              "Jonathan Chan-Pensley"  
## [105] "Jonathan Cherry"         "Joyce Carey"            
## [107] "Julia Chantrey"          "Julius Carry"           
## [109] "Katie Cassidy"           "Katie Cleary"           
## [111] "Kevin Casey"             "Kristen Connolly"       
## [113] "Larry the Cable Guy"     "Mahima Chaudhry"        
## [115] "Mariah Carey"            "Mary Crosby"            
## [117] "Matt Czuchry"            "Mithun Chakraborty"     
## [119] "Noam Chomsky"            "Raffey Cassidy"         
## [121] "Raj Singh Chaudhary"     "Reeve Carney"           
## [123] "Reg E. Cathey"           "Rob Corddry"            
## [125] "Robert Clohessy"         "Sabyasachi Chakraborty" 
## [127] "Sarita Choudhury"        "Sean Connery"           
## [129] "Sharlto Copley"          "Tim Conway"             
## [131] "Tim Curry"               "Tom Courtenay"          
## [133] "Tota Roy Chowdhury"      "Walter Connolly"        
## [135] "Wendell Corey"
# Split each actor's name on single spaces into its individual name parts
actors_list <- actors %>% str_split(" ")

# Convert to a matrix with one row per actor; shorter names are padded with NA
actors_matrix <- stri_list2matrix(actors_list, byrow = TRUE)

# Convert to a data frame and label the columns name1, name2, ... for however
# many name parts the longest name has, instead of assuming exactly four.
# stringsAsFactors = FALSE keeps the parts as character on every R version.
name_cols <- paste0("name", seq_len(ncol(actors_matrix)))
actors_df <- as.data.frame(actors_matrix, stringsAsFactors = FALSE)
colnames(actors_df) <- name_cols

# Create column with the name re-joined: missing parts become "", all parts are
# pasted with single spaces, and the padding left by the blanks is trimmed off
actors_df$name <- actors_df[name_cols] %>%
  lapply(coalesce, "") %>%
  unname() %>%
  c(sep = " ") %>%
  do.call(paste, .) %>%
  str_trim()

# View the first 20 rows, ordered by the first name part
actors_df[order(actors_df$name1), ] %>% head(20)
##          name1          name2    name3 name4                 name
## 7194    'Weird            Al' Yankovic  <NA>  'Weird Al' Yankovic
## 3050        50           Cent     <NA>  <NA>              50 Cent
## 4089        A.        Russell  Andrews  <NA>   A. Russell Andrews
## 7811      A.D.          Miles     <NA>  <NA>           A.D. Miles
## 4480      A.J.           Cook     <NA>  <NA>            A.J. Cook
## 4726      A.J.         Langer     <NA>  <NA>          A.J. Langer
## 8032      A.J.        Buckley     <NA>  <NA>         A.J. Buckley
## 9826      A.J.        DeLucia     <NA>  <NA>         A.J. DeLucia
## 3975   Aaliyah           <NA>     <NA>  <NA>              Aaliyah
## 1161     Aamir           Khan     <NA>  <NA>           Aamir Khan
## 3343     Aaran         Thomas     <NA>  <NA>         Aaran Thomas
## 402      Aaron        Eckhart     <NA>  <NA>        Aaron Eckhart
## 2164     Aaron           Paul     <NA>  <NA>           Aaron Paul
## 3101     Aaron           Kwok     <NA>  <NA>           Aaron Kwok
## 3282     Aaron Taylor-Johnson     <NA>  <NA> Aaron Taylor-Johnson
## 3920     Aaron            Yoo     <NA>  <NA>            Aaron Yoo
## 4677     Aaron         Murphy     <NA>  <NA>         Aaron Murphy
## 9345     Aaron          Ruell     <NA>  <NA>          Aaron Ruell
## 9717     Aaron       Stanford     <NA>  <NA>       Aaron Stanford
## 8812 Aasheekaa        Bathija     <NA>  <NA>    Aasheekaa Bathija
# Check each individual name part for the pattern "starts with C and ends with
# Y" (case-insensitive). One anchored regex expresses both conditions, and it
# is applied to every nameN column so the number of name parts is not fixed.
c_to_y <- "^C.*y$"
name_cols <- grep("^name[0-9]+$", colnames(actors_df), value = TRUE)
match_cols <- paste0(name_cols, "_match")
actors_df[match_cols] <- lapply(
  actors_df[name_cols],
  grepl,
  pattern = c_to_y,
  ignore.case = TRUE
)

# Filter to keep actors with at least one matching name part
any_match <- Reduce(`|`, actors_df[match_cols])
actors_df <- actors_df %>% filter(any_match)

# View names, we see those outliers are removed below
actors_df$name %>% sort() %>% unique()
##   [1] "Aashish Chaudhary"       "Alok Chakravarty"       
##   [3] "Andrew Dice Clay"        "Anna Chlumsky"          
##   [5] "Anne Consigny"           "Annie Corley"           
##   [7] "Art Carney"              "Babou Ceesay"           
##   [9] "Bill Cosby"              "Billy Connolly"         
##  [11] "Cab Calloway"            "Caity Lotz"             
##  [13] "Caley Hayes"             "Candy Ford"             
##  [15] "Carey Crim"              "Carey Lowell"           
##  [17] "Carey Means"             "Carey Mulligan"         
##  [19] "Carly Nahon"             "Carly Schroeder"        
##  [21] "Cary Elwes"              "Cary Grant"             
##  [23] "Casey Affleck"           "Casey Dubois"           
##  [25] "Casey Fallo"             "Casey Gooden"           
##  [27] "Casey Groves"            "Casey Hooper"           
##  [29] "Casey Twenter"           "Cassidy Gifford"        
##  [31] "Cathy Meils"             "Cathy Moriarty"         
##  [33] "Chandler Canterbury"     "Chaney Kley"            
##  [35] "Charley Grapewin"        "Chauncey Leopardi"      
##  [37] "Chelsey Reist"           "Cherry Jones"           
##  [39] "Chevy Chase"             "Chloe Csengery"         
##  [41] "Chris Coy"               "Christopher Carley"     
##  [43] "Christopher Curry"       "Christy Chung"          
##  [45] "Christy Lighthouse"      "Cicely Tyson"           
##  [47] "Cindy Butler"            "Cindy Karr"             
##  [49] "Cindy Manion"            "Clancy Brown"           
##  [51] "Clemency Burton-Hill"    "Cody Horn"              
##  [53] "Cody Howard"             "Cody Linley"            
##  [55] "Cody McMains"            "Colby French"           
##  [57] "Corey Burton"            "Corey Feldman"          
##  [59] "Corey Haim"              "Corey Hawkins"          
##  [61] "Corey Johnson"           "Corey Moosa"            
##  [63] "Corey Sevier"            "Corey Stoll"            
##  [65] "Cory Fernandez"          "Cory Hardrict"          
##  [67] "Cory Hodges"             "Cory Monteith"          
##  [69] "Courteney Cox"           "Courtney B. Vance"      
##  [71] "Courtney Barilla"        "Courtney Bell"          
##  [73] "Courtney Eaton"          "Courtney Halverson"     
##  [75] "Courtney Taylor Burness" "Courtney Thorne-Smith"  
##  [77] "Dana Carvey"             "Darryl Cooksey"         
##  [79] "David Connolly"          "Denise Crosby"          
##  [81] "Don 'D.C.' Curry"        "Elaine Cassidy"         
##  [83] "Ellie Chidzey"           "Emma Cleasby"           
##  [85] "Frances Conroy"          "Gary Conway"            
##  [87] "George Clooney"          "Henry Czerny"           
##  [89] "Jack Conley"             "Jai Courtney"           
##  [91] "Jake Cherry"             "Jarlath Conroy"         
##  [93] "Jeff Conaway"            "Jennifer Connelly"      
##  [95] "Jim Carrey"              "Jim Conroy"             
##  [97] "Joanna Cassidy"          "Joel Courtney"          
##  [99] "John Candy"              "Jonathan Chan-Pensley"  
## [101] "Jonathan Cherry"         "Joyce Carey"            
## [103] "Julia Chantrey"          "Julius Carry"           
## [105] "Katie Cassidy"           "Katie Cleary"           
## [107] "Kevin Casey"             "Kristen Connolly"       
## [109] "Mahima Chaudhry"         "Mariah Carey"           
## [111] "Mary Crosby"             "Matt Czuchry"           
## [113] "Mithun Chakraborty"      "Noam Chomsky"           
## [115] "Raffey Cassidy"          "Raj Singh Chaudhary"    
## [117] "Reeve Carney"            "Reg E. Cathey"          
## [119] "Rob Corddry"             "Robert Clohessy"        
## [121] "Sabyasachi Chakraborty"  "Sarita Choudhury"       
## [123] "Sean Connery"            "Sharlto Copley"         
## [125] "Tim Conway"              "Tim Curry"              
## [127] "Tom Courtenay"           "Tota Roy Chowdhury"     
## [129] "Walter Connolly"         "Wendell Corey"

GSUB & SUB

# Find actors with a part of their name wrapped in single quotation marks (')
# https://javascript.info/regexp-greedy-and-lazy
# https://www.rexegg.com/regex-quantifiers.php
# A greedy quantifier gives the longest match and a lazy one (.*?) the
# shortest; here we return all values that have a match
actors %>%
  grep("'.*?'", ., value = TRUE, ignore.case = TRUE)
##  [1] "Gary 'G. Thang' Johnson"       "Scott 'Carrot Top' Thompson"  
##  [3] "Cesáreo Quezadas 'Pulgarcito'" "José Luis Aguirre 'Trotsky'"  
##  [5] "Tung Cho 'Joe' Cheung"         "Oliver 'Ole' Zemen"           
##  [7] "Michael 'Xeno' Langebeck"      "Joanna 'JoJo' Levesque"       
##  [9] "Don 'D.C.' Curry"              "Eddie 'Piolin' Sotelo"        
## [11] "'Weird Al' Yankovic"           "George 'Buck' Flower"         
## [13] "Tommy 'Tiny' Lister"           "Will 'Spank' Horton"          
## [15] "Mike 'The Miz' Mizanin"        "Yousef 'Joe' Sweid"           
## [17] "Stephanie 'Stevvi' Alexander"  "Julie 'Jules' Urich"          
## [19] "Wilbur 'Hi-Fi' White"          "Chris 'Wonder' Schoeck"
# Store the actors that have a quoted part in their name
quotation_actors <- actors[grepl("'.*?'", actors, ignore.case = TRUE)]

# gsub replaces every quotation mark in each name
quotation_actors %>%
  gsub("'", "(", .)
##  [1] "Gary (G. Thang( Johnson"       "Scott (Carrot Top( Thompson"  
##  [3] "Cesáreo Quezadas (Pulgarcito(" "José Luis Aguirre (Trotsky("  
##  [5] "Tung Cho (Joe( Cheung"         "Oliver (Ole( Zemen"           
##  [7] "Michael (Xeno( Langebeck"      "Joanna (JoJo( Levesque"       
##  [9] "Don (D.C.( Curry"              "Eddie (Piolin( Sotelo"        
## [11] "(Weird Al( Yankovic"           "George (Buck( Flower"         
## [13] "Tommy (Tiny( Lister"           "Will (Spank( Horton"          
## [15] "Mike (The Miz( Mizanin"        "Yousef (Joe( Sweid"           
## [17] "Stephanie (Stevvi( Alexander"  "Julie (Jules( Urich"          
## [19] "Wilbur (Hi-Fi( White"          "Chris (Wonder( Schoeck"
# sub replaces only the first quotation mark in each name
quotation_actors %>%
  sub("'", "(", .)
##  [1] "Gary (G. Thang' Johnson"       "Scott (Carrot Top' Thompson"  
##  [3] "Cesáreo Quezadas (Pulgarcito'" "José Luis Aguirre (Trotsky'"  
##  [5] "Tung Cho (Joe' Cheung"         "Oliver (Ole' Zemen"           
##  [7] "Michael (Xeno' Langebeck"      "Joanna (JoJo' Levesque"       
##  [9] "Don (D.C.' Curry"              "Eddie (Piolin' Sotelo"        
## [11] "(Weird Al' Yankovic"           "George (Buck' Flower"         
## [13] "Tommy (Tiny' Lister"           "Will (Spank' Horton"          
## [15] "Mike (The Miz' Mizanin"        "Yousef (Joe' Sweid"           
## [17] "Stephanie (Stevvi' Alexander"  "Julie (Jules' Urich"          
## [19] "Wilbur (Hi-Fi' White"          "Chris (Wonder' Schoeck"
# Turn the first quotation mark into "(" and then the next remaining one
# into ")"
parenthesis_actors <- quotation_actors %>%
  sub("'", "(", .) %>%
  sub("'", ")", .)

parenthesis_actors
##  [1] "Gary (G. Thang) Johnson"       "Scott (Carrot Top) Thompson"  
##  [3] "Cesáreo Quezadas (Pulgarcito)" "José Luis Aguirre (Trotsky)"  
##  [5] "Tung Cho (Joe) Cheung"         "Oliver (Ole) Zemen"           
##  [7] "Michael (Xeno) Langebeck"      "Joanna (JoJo) Levesque"       
##  [9] "Don (D.C.) Curry"              "Eddie (Piolin) Sotelo"        
## [11] "(Weird Al) Yankovic"           "George (Buck) Flower"         
## [13] "Tommy (Tiny) Lister"           "Will (Spank) Horton"          
## [15] "Mike (The Miz) Mizanin"        "Yousef (Joe) Sweid"           
## [17] "Stephanie (Stevvi) Alexander"  "Julie (Jules) Urich"          
## [19] "Wilbur (Hi-Fi) White"          "Chris (Wonder) Schoeck"

Finding Numbers

# Titles that contain at least one digit (first 20)
head(grep("[0-9]", df$Title, value = TRUE, ignore.case = TRUE), 20)
##  [1] "Superbabies: Baby Geniuses 2"                
##  [2] "Birdemic 2: The Resurrection"                
##  [3] "Dracula 3000"                                
##  [4] "Leonard Part 6"                              
##  [5] "Kyaa Kool Hain Hum 3"                        
##  [6] "Lawnmower Man 2: Beyond Cyberspace"          
##  [7] "Car 54, Where Are You?"                      
##  [8] "12 Angry Men"                                
##  [9] "Se7en"                                       
## [10] "Terminator 2: Judgment Day"                  
## [11] "Toy Story 3"                                 
## [12] "2001: A Space Odyssey"                       
## [13] "Kill Bill: Vol. 1"                           
## [14] "The Legend of 1900"                          
## [15] "12 Years a Slave"                            
## [16] "Harry Potter and the Deathly Hallows: Part 2"
## [17] "Guardians of the Galaxy Vol. 2"              
## [18] "Stalag 17"                                   
## [19] "Short Term 12"                               
## [20] "3 Idiots"
# Replace non-digits with empty string, format digits as numbers. All digits in
# a title are joined into a single number; titles without any digit become NA
# and are dropped before de-duplicating. The conversion is computed once and
# reused for both the NA test and the result.
title_digits <- as.numeric(gsub("\\D", "", df$Title))
title_digits[!is.na(title_digits)] %>% unique()
##  [1]     2  3000     6     3    54    12     7  2001     1  1900    17    26
## [13]    24   420    20  9211    10    21   400     8  2012    47     4    13
## [25]   300   123    80    19   102  2000    22    50   571    49    51    44
## [37]   310    16    28    15    42     5    40    23  1941     9    27  1330
## [49]  3313    30    88  1911    39    33  1408    93   127  4040    55  2046
## [61]   911    43  1974    60 40000  5050   500    46  1000   200  1114   786
## [73]   100 20000    18    90    25  1776   213  1984    72  2016    81   247
## [85]    66    41   132  1838  1982   432    70
# Find titles containing "one", "three" or "IV" surrounded by whitespace, or
# "II" anywhere in the title (which also catches III, VII and VIII); matching
# is case-insensitive. "II" is written literally because backslash-escaping a
# non-metacharacter is implementation-dependent.
grep(
  "\\sone\\s|II|\\sthree\\s|\\sIV\\s",
  df$Title,
  value = TRUE,
  ignore.case = TRUE
)
##  [1] "Dark Harvest II: The Maize"                             
##  [2] "Boggy Creek II: And the Legend Continues"               
##  [3] "The Godfather: Part II"                                 
##  [4] "Star Wars: Episode IV - A New Hope"                     
##  [5] "It Happened One Night"                                  
##  [6] "No One Killed Jessica"                                  
##  [7] "Mission: Impossible III"                                
##  [8] "Men in Black II"                                        
##  [9] "Bad Boys II"                                            
## [10] "Harry Potter and the Deathly Hallows: Part II"          
## [11] "Mission: Impossible II"                                 
## [12] "Star Wars: Episode III - Revenge of the Sith"           
## [13] "Star Wars: Episode II - Attack of the Clones"           
## [14] "Jurassic Park III"                                      
## [15] "The Three Musketeers"                                   
## [16] "Nutty Professor II: The Klumps"                         
## [17] "Hellboy II: The Golden Army"                            
## [18] "The Hangover Part II"                                   
## [19] "Rambo III"                                              
## [20] "Superman II"                                            
## [21] "Blade II"                                               
## [22] "The Godfather: Part III"                                
## [23] "Beverly Hills Cop III"                                  
## [24] "Rambo: First Blood Part II"                             
## [25] "Back to the Future Part II"                             
## [26] "Back to the Future Part III"                            
## [27] "Superman III"                                           
## [28] "The Three Stooges"                                      
## [29] "The Next Three Days"                                    
## [30] "Teenage Mutant Ninja Turtles II: The Secret of the Ooze"
## [31] "Jeepers Creepers II"                                    
## [32] "Teenage Mutant Ninja Turtles III"                       
## [33] "Beverly Hills Cop II"                                   
## [34] "Les couloirs du temps: Les visiteurs II"                
## [35] "Star Trek III: The Search for Spock"                    
## [36] "Halloween II"                                           
## [37] "The Hills Have Eyes II"                                 
## [38] "Crocodile Dundee II"                                    
## [39] "Exorcist II: The Heretic"                               
## [40] "Star Trek II: The Wrath of Khan"                        
## [41] "A Tale of Three Cities"                                 
## [42] "Saw III"                                                
## [43] "Atlas Shrugged II: The Strike"                          
## [44] "Poltergeist III"                                        
## [45] "Richard III"                                            
## [46] "The Boondock Saints II: All Saints Day"                 
## [47] "Hostel: Part II"                                        
## [48] "The Work and the Glory II: American Zion"               
## [49] "Saw II"                                                 
## [50] "Clerks II"                                              
## [51] "Friday the 13th Part VIII: Jason Takes Manhattan"       
## [52] "The Last Exorcism Part II"                              
## [53] "Menace II Society"                                      
## [54] "Evil Dead II"                                           
## [55] "Phantasm II"                                            
## [56] "Thirteen Conversations About One Thing"                 
## [57] "Friday the 13th Part VII: The New Blood"                
## [58] "Halloween III: Season of the Witch"                     
## [59] "The Toxic Avenger Part II"                              
## [60] "Friday the 13th Part III"                               
## [61] "Return of the Living Dead III"

Finding Punctuation

# Show the first 30 titles that contain at least one punctuation character
head(grep("[[:punct:]]", df$Title, value = TRUE, ignore.case = TRUE), 30)
##  [1] "Code Name: K.O.Z."                                                                 
##  [2] "Superbabies: Baby Geniuses 2"                                                      
##  [3] "Manos: The Hands of Fate"                                                          
##  [4] "Pledge This!"                                                                      
##  [5] "Foodfight!"                                                                        
##  [6] "Birdemic: Shock and Terror"                                                        
##  [7] "Dream.net"                                                                         
##  [8] "Titanic: The Legend Goes On..."                                                    
##  [9] "The Hottie & the Nottie"                                                           
## [10] "Keloglan vs. the Black Prince"                                                     
## [11] "A Fox's Tale"                                                                      
## [12] "Ram Gopal Varma's Indian Flames"                                                   
## [13] "Ben & Arthur"                                                                      
## [14] "Birdemic 2: The Resurrection"                                                      
## [15] "Monster a-Go Go"                                                                   
## [16] "Who's Your Caddy?"                                                                 
## [17] "Anne B. Real"                                                                      
## [18] "The Incredibly Strange Creatures Who Stopped Living and Became Mixed-Up Zombies!!?"
## [19] "Lawnmower Man 2: Beyond Cyberspace"                                                
## [20] "Dark Harvest II: The Maize"                                                        
## [21] "Boggy Creek II: And the Legend Continues"                                          
## [22] "Car 54, Where Are You?"                                                            
## [23] "Dragonball: Evolution"                                                             
## [24] "It's Pat: The Movie"                                                               
## [25] "Ghosts Can't Do It"                                                                
## [26] "The Godfather: Part II"                                                            
## [27] "Schindler's List"                                                                  
## [28] "The Lord of the Rings: The Return of the King"                                     
## [29] "The Lord of the Rings: The Fellowship of the Ring"                                 
## [30] "Star Wars: Episode V - The Empire Strikes Back"
# Find titles that contain a question mark anywhere in the title.
# A literal (fixed) search is used: the earlier regex "*\\?" started with a
# quantifier that had nothing to repeat, which only works because the default
# regex engine tolerates it. Case-insensitivity is irrelevant for "?".
grep("?", df$Title, value = TRUE, fixed = TRUE)
##  [1] "Who's Your Caddy?"                                                                 
##  [2] "The Incredibly Strange Creatures Who Stopped Living and Became Mixed-Up Zombies!!?"
##  [3] "Car 54, Where Are You?"                                                            
##  [4] "What Ever Happened to Baby Jane?"                                                  
##  [5] "Who's Afraid of Virginia Woolf?"                                                   
##  [6] "Did You Hear About the Morgans?"                                                   
##  [7] "What Planet Are You From?"                                                         
##  [8] "What's the Worst That Could Happen?"                                               
##  [9] "O Brother, Where Art Thou?"                                                        
## [10] "Are We There Yet?"                                                                 
## [11] "What's Your Number?"                                                               
## [12] "Why Did I Get Married?"                                                            
## [13] "Dude, Where's My Car?"                                                             
## [14] "When Did You Last See Your Father?"                                                
## [15] "Atlas Shrugged: Who Is John Galt?"                                                 
## [16] "What the #$*! Do We (K)now!?"                                                      
## [17] "Do You Believe?"                                                                   
## [18] "Who Killed the Electric Car?"                                                      
## [19] "Dude, Where's My Dog?!"
# Find titles whose final character is a question mark
df$Title %>%
  grep(pattern = "\\?$", value = TRUE, ignore.case = TRUE)
##  [1] "Who's Your Caddy?"                                                                 
##  [2] "The Incredibly Strange Creatures Who Stopped Living and Became Mixed-Up Zombies!!?"
##  [3] "Car 54, Where Are You?"                                                            
##  [4] "What Ever Happened to Baby Jane?"                                                  
##  [5] "Who's Afraid of Virginia Woolf?"                                                   
##  [6] "Did You Hear About the Morgans?"                                                   
##  [7] "What Planet Are You From?"                                                         
##  [8] "What's the Worst That Could Happen?"                                               
##  [9] "O Brother, Where Art Thou?"                                                        
## [10] "Are We There Yet?"                                                                 
## [11] "What's Your Number?"                                                               
## [12] "Why Did I Get Married?"                                                            
## [13] "Dude, Where's My Car?"                                                             
## [14] "When Did You Last See Your Father?"                                                
## [15] "Atlas Shrugged: Who Is John Galt?"                                                 
## [16] "What the #$*! Do We (K)now!?"                                                      
## [17] "Do You Believe?"                                                                   
## [18] "Who Killed the Electric Car?"

Finding word frequency

# Tokenise every title: strip punctuation, split on single spaces, lower-case
title_list <- str_to_lower(
  unlist(strsplit(str_replace_all(df$Title, "[[:punct:]]", ""), " "))
)

# Count how often each token occurs (the table is kept wrapped in a list)
title_list_freq <- list(table(title_list))

# Turn the counts into a two-column data frame
title_df <- data.frame(title_list_freq)
names(title_df) <- c("word", "freq")

# Preview the first ten rows
head(title_df, 10)
##    word freq
## 1         77
## 2     $    1
## 3     +    2
## 4     1    6
## 5    10    5
## 6  1000    1
## 7   102    1
## 8  10th    1
## 9    11    1
## 10 1114    1
# Ten most common title words, ignoring the empty-string token
title_df %>%
  filter(word != "") %>%
  arrange(desc(freq)) %>%
  head(10)
##    word freq
## 1   the 1617
## 2    of  508
## 3     a  197
## 4   and  153
## 5    in  129
## 6     2  112
## 7    to  111
## 8   man   69
## 9  love   59
## 10   on   55
# Find first words
# Split each title on single spaces; `titles` is a list holding one character
# vector of words per title.
titles <- df$Title %>% strsplit(" ")

length(titles)
## [1] 5273
# Take the first word of every title in a single pass. This replaces a loop
# that grew `first_words` with append() on each iteration and iterated over
# 1:length(titles), which fails when `titles` is empty. A title that splits
# into zero words still yields NA.
first_words <- vapply(
  titles,
  function(words) words[1],
  character(1),
  USE.NAMES = FALSE
)

first_words %>% head(10)
##  [1] "Code"         "Saving"       "Superbabies:" "Daniel"       "Manos:"      
##  [6] "Pledge"       "Turks"        "Foodfight!"   "Birdemic:"    "Dream.net"
# Count first words after stripping punctuation and lower-casing
first_words_freq <- first_words %>%
  str_replace_all(pattern = "[:punct:]", replacement = "") %>%
  str_to_lower() %>%
  table()

# Store the counts as a two-column data frame
first_words_df <- data.frame(first_words_freq)
names(first_words_df) <- c("word", "freq")

# Ten most common first words, ignoring the empty-string token
first_words_df %>%
  filter(word != "") %>%
  arrange(desc(freq)) %>%
  head(10)
##        word freq
## 1       the 1010
## 2         a   84
## 3        my   34
## 4         i   31
## 5        in   21
## 6      star   19
## 7  american   18
## 8       all   16
## 9       red   16
## 10    black   15
# Find last words
# Split each title on single spaces; `titles` is a list holding one character
# vector of words per title.
titles <- df$Title %>% strsplit(" ")

length(titles)
## [1] 5273
# Take the last word of every title in a single pass. This replaces a loop
# that grew `last_words` with append() on each iteration and iterated over
# 1:length(titles), which fails when `titles` is empty. A title that splits
# into zero words now yields NA instead of being silently dropped, so
# `last_words` always has one entry per title.
last_words <- vapply(
  titles,
  function(words) {
    if (length(words) == 0) NA_character_ else words[length(words)]
  },
  character(1),
  USE.NAMES = FALSE
)

last_words %>% head(10)
##  [1] "K.O.Z."     "Christmas"  "2"          "Zauberer"   "Fate"      
##  [6] "This!"      "Space"      "Foodfight!" "Terror"     "Dream.net"
# Count last words after stripping punctuation and lower-casing
last_words_freq <- last_words %>%
  str_replace_all(pattern = "[:punct:]", replacement = "") %>%
  str_to_lower() %>%
  table()

# Store the counts as a two-column data frame
last_words_df <- data.frame(last_words_freq)
names(last_words_df) <- c("word", "freq")

# Twenty most common last words, ignoring the empty-string token
last_words_df %>%
  filter(word != "") %>%
  arrange(desc(freq)) %>%
  head(20)
##     word freq
## 1      2   79
## 2    man   42
## 3  movie   38
## 4   love   24
## 5     ii   23
## 6     me   22
## 7  story   22
## 8      3   20
## 9    day   20
## 10  dead   20
## 11  girl   18
## 12   men   17
## 13 house   16
## 14 world   15
## 15   you   15
## 16    it   14
## 17   iii   13
## 18   war   13
## 19   boy   12
## 20  days   12